; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256,AVX1
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256,AVX1
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256,AVX2
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256,AVX256DQ

; Data layout string: little-endian ("e"), ELF name mangling ("m:e"),
; 64-bit aligned i64, 128-bit aligned x86 f80, native integer widths of
; 8/16/32/64 bits and a 128-bit natural stack alignment ("S128").
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

; Integer source arrays, one per element width. Each is 64 bytes in total and
; 64-byte aligned, so element 0 of every array is a 64-byte aligned address.
@src64 = common global [8 x i64] zeroinitializer, align 64
@src32 = common global [16 x i32] zeroinitializer, align 64
@src16 = common global [32 x i16] zeroinitializer, align 64
@src8  = common global [64 x i8] zeroinitializer, align 64

; Floating-point destination arrays (double and float results respectively),
; also 64-byte aligned.
@dst64 = common global [8 x double] zeroinitializer, align 64
@dst32 = common global [16 x float] zeroinitializer, align 64

;
; UITOFP to vXf64
;

; Two adjacent i64 elements of @src64 are converted with uitofp to double and
; stored to the first two elements of @dst64.
; Every configuration (shared CHECK prefix) expects a single <2 x i64> load,
; one <2 x i64> to <2 x double> uitofp and a single <2 x double> store.
define void @uitofp_2i64_2f64() #0 {
; CHECK-LABEL: @uitofp_2i64_2f64(
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @src64, align 64
; CHECK-NEXT:    [[TMP2:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x double>
; CHECK-NEXT:    store <2 x double> [[TMP2]], ptr @dst64, align 64
; CHECK-NEXT:    ret void
;
  ; Scalar input: consecutive loads, per-element conversions, consecutive stores.
  %ld0 = load i64, ptr @src64, align 64
  %ld1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @src64, i32 0, i64 1), align 8
  %cvt0 = uitofp i64 %ld0 to double
  %cvt1 = uitofp i64 %ld1 to double
  store double %cvt0, ptr @dst64, align 64
  store double %cvt1, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 1), align 8
  ret void
}

; Four adjacent i64 elements of @src64 are converted with uitofp to double and
; stored to elements 0..3 of @dst64.
; Expected output:
;   - SSE prefix (no -mcpu): two independent <2 x i64> to <2 x double> chains,
;     the second one starting at element 2 with 16-byte alignment.
;   - AVX prefix (all runs that pass -mcpu): a single <4 x i64> to
;     <4 x double> chain.
define void @uitofp_4i64_4f64() #0 {
; SSE-LABEL: @uitofp_4i64_4f64(
; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @src64, align 64
; SSE-NEXT:    [[TMP2:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x double>
; SSE-NEXT:    store <2 x double> [[TMP2]], ptr @dst64, align 64
; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @src64, i32 0, i64 2), align 16
; SSE-NEXT:    [[TMP4:%.*]] = uitofp <2 x i64> [[TMP3]] to <2 x double>
; SSE-NEXT:    store <2 x double> [[TMP4]], ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 2), align 16
; SSE-NEXT:    ret void
;
; AVX-LABEL: @uitofp_4i64_4f64(
; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @src64, align 64
; AVX-NEXT:    [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x double>
; AVX-NEXT:    store <4 x double> [[TMP2]], ptr @dst64, align 64
; AVX-NEXT:    ret void
;
  ; Scalar input: consecutive loads, per-element conversions, consecutive stores.
  %ld0 = load i64, ptr @src64, align 64
  %ld1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @src64, i32 0, i64 1), align 8
  %ld2 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @src64, i32 0, i64 2), align 16
  %ld3 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @src64, i32 0, i64 3), align 8
  %cvt0 = uitofp i64 %ld0 to double
  %cvt1 = uitofp i64 %ld1 to double
  %cvt2 = uitofp i64 %ld2 to double
  %cvt3 = uitofp i64 %ld3 to double
  store double %cvt0, ptr @dst64, align 64
  store double %cvt1, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 1), align 8
  store double %cvt2, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 2), align 16
  store double %cvt3, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 3), align 8
  ret void
}

; All eight i64 elements of @src64 are converted with uitofp to double and
; stored to all eight elements of @dst64.
; Expected output:
;   - SSE prefix: four <2 x i64> to <2 x double> chains (elements 0, 2, 4, 6).
;   - AVX256 prefix (corei7-avx, bdver1, core-avx2, and skylake-avx512 with
;     +prefer-256-bit): two <4 x i64> to <4 x double> chains (elements 0, 4).
;   - AVX512 prefix (skylake-avx512 with -prefer-256-bit): one <8 x i64> to
;     <8 x double> chain.
define void @uitofp_8i64_8f64() #0 {
; SSE-LABEL: @uitofp_8i64_8f64(
; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @src64, align 64
; SSE-NEXT:    [[TMP2:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x double>
; SSE-NEXT:    store <2 x double> [[TMP2]], ptr @dst64, align 64
; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @src64, i32 0, i64 2), align 16
; SSE-NEXT:    [[TMP4:%.*]] = uitofp <2 x i64> [[TMP3]] to <2 x double>
; SSE-NEXT:    store <2 x double> [[TMP4]], ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 2), align 16
; SSE-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @src64, i32 0, i64 4), align 32
; SSE-NEXT:    [[TMP6:%.*]] = uitofp <2 x i64> [[TMP5]] to <2 x double>
; SSE-NEXT:    store <2 x double> [[TMP6]], ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 4), align 32
; SSE-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @src64, i32 0, i64 6), align 16
; SSE-NEXT:    [[TMP8:%.*]] = uitofp <2 x i64> [[TMP7]] to <2 x double>
; SSE-NEXT:    store <2 x double> [[TMP8]], ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 6), align 16
; SSE-NEXT:    ret void
;
; AVX256-LABEL: @uitofp_8i64_8f64(
; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @src64, align 64
; AVX256-NEXT:    [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x double>
; AVX256-NEXT:    store <4 x double> [[TMP2]], ptr @dst64, align 64
; AVX256-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @src64, i32 0, i64 4), align 32
; AVX256-NEXT:    [[TMP4:%.*]] = uitofp <4 x i64> [[TMP3]] to <4 x double>
; AVX256-NEXT:    store <4 x double> [[TMP4]], ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 4), align 32
; AVX256-NEXT:    ret void
;
; AVX512-LABEL: @uitofp_8i64_8f64(
; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr @src64, align 64
; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <8 x i64> [[TMP1]] to <8 x double>
; AVX512-NEXT:    store <8 x double> [[TMP2]], ptr @dst64, align 64
; AVX512-NEXT:    ret void
;
  ; Scalar input: consecutive loads, per-element conversions, consecutive stores.
  %ld0 = load i64, ptr @src64, align 64
  %ld1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @src64, i32 0, i64 1), align 8
  %ld2 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @src64, i32 0, i64 2), align 16
  %ld3 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @src64, i32 0, i64 3), align 8
  %ld4 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @src64, i32 0, i64 4), align 32
  %ld5 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @src64, i32 0, i64 5), align 8
  %ld6 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @src64, i32 0, i64 6), align 16
  %ld7 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @src64, i32 0, i64 7), align 8
  %cvt0 = uitofp i64 %ld0 to double
  %cvt1 = uitofp i64 %ld1 to double
  %cvt2 = uitofp i64 %ld2 to double
  %cvt3 = uitofp i64 %ld3 to double
  %cvt4 = uitofp i64 %ld4 to double
  %cvt5 = uitofp i64 %ld5 to double
  %cvt6 = uitofp i64 %ld6 to double
  %cvt7 = uitofp i64 %ld7 to double
  store double %cvt0, ptr @dst64, align 64
  store double %cvt1, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 1), align 8
  store double %cvt2, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 2), align 16
  store double %cvt3, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 3), align 8
  store double %cvt4, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 4), align 32
  store double %cvt5, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 5), align 8
  store double %cvt6, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 6), align 16
  store double %cvt7, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 7), align 8
  ret void
}

; Two adjacent i32 elements of @src32 are converted with uitofp to double and
; stored to the first two elements of @dst64.
; Expected output:
;   - SSE, AVX2, AVX512 and AVX256DQ prefixes: a single <2 x i32> to
;     <2 x double> chain.
;   - AVX1 prefix (corei7-avx and bdver1): the IR is expected to stay scalar,
;     i.e. the checks mirror the input instructions unchanged.
define void @uitofp_2i32_2f64() #0 {
; SSE-LABEL: @uitofp_2i32_2f64(
; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr @src32, align 64
; SSE-NEXT:    [[TMP2:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x double>
; SSE-NEXT:    store <2 x double> [[TMP2]], ptr @dst64, align 64
; SSE-NEXT:    ret void
;
; AVX1-LABEL: @uitofp_2i32_2f64(
; AVX1-NEXT:    [[LD0:%.*]] = load i32, ptr @src32, align 64
; AVX1-NEXT:    [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 1), align 4
; AVX1-NEXT:    [[CVT0:%.*]] = uitofp i32 [[LD0]] to double
; AVX1-NEXT:    [[CVT1:%.*]] = uitofp i32 [[LD1]] to double
; AVX1-NEXT:    store double [[CVT0]], ptr @dst64, align 64
; AVX1-NEXT:    store double [[CVT1]], ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 1), align 8
; AVX1-NEXT:    ret void
;
; AVX2-LABEL: @uitofp_2i32_2f64(
; AVX2-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr @src32, align 64
; AVX2-NEXT:    [[TMP2:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x double>
; AVX2-NEXT:    store <2 x double> [[TMP2]], ptr @dst64, align 64
; AVX2-NEXT:    ret void
;
; AVX512-LABEL: @uitofp_2i32_2f64(
; AVX512-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr @src32, align 64
; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x double>
; AVX512-NEXT:    store <2 x double> [[TMP2]], ptr @dst64, align 64
; AVX512-NEXT:    ret void
;
; AVX256DQ-LABEL: @uitofp_2i32_2f64(
; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr @src32, align 64
; AVX256DQ-NEXT:    [[TMP2:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x double>
; AVX256DQ-NEXT:    store <2 x double> [[TMP2]], ptr @dst64, align 64
; AVX256DQ-NEXT:    ret void
;
  ; Scalar input: consecutive loads, per-element conversions, consecutive stores.
  %ld0 = load i32, ptr @src32, align 64
  %ld1 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 1), align 4
  %cvt0 = uitofp i32 %ld0 to double
  %cvt1 = uitofp i32 %ld1 to double
  store double %cvt0, ptr @dst64, align 64
  store double %cvt1, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 1), align 8
  ret void
}

; Four adjacent i32 elements of @src32 are converted with uitofp to double and
; stored to elements 0..3 of @dst64.
; Expected output:
;   - SSE prefix: two <2 x i32> to <2 x double> chains (source elements 0, 2).
;   - AVX prefix (all runs that pass -mcpu): a single <4 x i32> to
;     <4 x double> chain.
define void @uitofp_4i32_4f64() #0 {
; SSE-LABEL: @uitofp_4i32_4f64(
; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr @src32, align 64
; SSE-NEXT:    [[TMP2:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x double>
; SSE-NEXT:    store <2 x double> [[TMP2]], ptr @dst64, align 64
; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 2), align 8
; SSE-NEXT:    [[TMP4:%.*]] = uitofp <2 x i32> [[TMP3]] to <2 x double>
; SSE-NEXT:    store <2 x double> [[TMP4]], ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 2), align 16
; SSE-NEXT:    ret void
;
; AVX-LABEL: @uitofp_4i32_4f64(
; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 64
; AVX-NEXT:    [[TMP2:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x double>
; AVX-NEXT:    store <4 x double> [[TMP2]], ptr @dst64, align 64
; AVX-NEXT:    ret void
;
  ; Scalar input: consecutive loads, per-element conversions, consecutive stores.
  %ld0 = load i32, ptr @src32, align 64
  %ld1 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 1), align 4
  %ld2 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 2), align 8
  %ld3 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 3), align 4
  %cvt0 = uitofp i32 %ld0 to double
  %cvt1 = uitofp i32 %ld1 to double
  %cvt2 = uitofp i32 %ld2 to double
  %cvt3 = uitofp i32 %ld3 to double
  store double %cvt0, ptr @dst64, align 64
  store double %cvt1, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 1), align 8
  store double %cvt2, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 2), align 16
  store double %cvt3, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 3), align 8
  ret void
}

; The first eight i32 elements of @src32 are converted with uitofp to double
; and stored to all eight elements of @dst64.
; Expected output:
;   - SSE prefix: four <2 x i32> to <2 x double> chains (elements 0, 2, 4, 6).
;   - AVX256 prefix: two <4 x i32> to <4 x double> chains (elements 0, 4).
;   - AVX512 prefix: one <8 x i32> to <8 x double> chain.
define void @uitofp_8i32_8f64() #0 {
; SSE-LABEL: @uitofp_8i32_8f64(
; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr @src32, align 64
; SSE-NEXT:    [[TMP2:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x double>
; SSE-NEXT:    store <2 x double> [[TMP2]], ptr @dst64, align 64
; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 2), align 8
; SSE-NEXT:    [[TMP4:%.*]] = uitofp <2 x i32> [[TMP3]] to <2 x double>
; SSE-NEXT:    store <2 x double> [[TMP4]], ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 2), align 16
; SSE-NEXT:    [[TMP5:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 4), align 16
; SSE-NEXT:    [[TMP6:%.*]] = uitofp <2 x i32> [[TMP5]] to <2 x double>
; SSE-NEXT:    store <2 x double> [[TMP6]], ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 4), align 32
; SSE-NEXT:    [[TMP7:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 6), align 8
; SSE-NEXT:    [[TMP8:%.*]] = uitofp <2 x i32> [[TMP7]] to <2 x double>
; SSE-NEXT:    store <2 x double> [[TMP8]], ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 6), align 16
; SSE-NEXT:    ret void
;
; AVX256-LABEL: @uitofp_8i32_8f64(
; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 64
; AVX256-NEXT:    [[TMP2:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x double>
; AVX256-NEXT:    store <4 x double> [[TMP2]], ptr @dst64, align 64
; AVX256-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 4), align 16
; AVX256-NEXT:    [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x double>
; AVX256-NEXT:    store <4 x double> [[TMP4]], ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 4), align 32
; AVX256-NEXT:    ret void
;
; AVX512-LABEL: @uitofp_8i32_8f64(
; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @src32, align 64
; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <8 x i32> [[TMP1]] to <8 x double>
; AVX512-NEXT:    store <8 x double> [[TMP2]], ptr @dst64, align 64
; AVX512-NEXT:    ret void
;
  ; Scalar input: consecutive loads, per-element conversions, consecutive stores.
  %ld0 = load i32, ptr @src32, align 64
  %ld1 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 1), align 4
  %ld2 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 2), align 8
  %ld3 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 3), align 4
  %ld4 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 4), align 16
  %ld5 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 5), align 4
  %ld6 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 6), align 8
  %ld7 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 7), align 4
  %cvt0 = uitofp i32 %ld0 to double
  %cvt1 = uitofp i32 %ld1 to double
  %cvt2 = uitofp i32 %ld2 to double
  %cvt3 = uitofp i32 %ld3 to double
  %cvt4 = uitofp i32 %ld4 to double
  %cvt5 = uitofp i32 %ld5 to double
  %cvt6 = uitofp i32 %ld6 to double
  %cvt7 = uitofp i32 %ld7 to double
  store double %cvt0, ptr @dst64, align 64
  store double %cvt1, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 1), align 8
  store double %cvt2, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 2), align 16
  store double %cvt3, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 3), align 8
  store double %cvt4, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 4), align 32
  store double %cvt5, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 5), align 8
  store double %cvt6, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 6), align 16
  store double %cvt7, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 7), align 8
  ret void
}

; Two adjacent i16 elements of @src16 are converted with uitofp to double and
; stored to the first two elements of @dst64.
; Every configuration (shared CHECK prefix) expects a single <2 x i16> to
; <2 x double> chain.
define void @uitofp_2i16_2f64() #0 {
; CHECK-LABEL: @uitofp_2i16_2f64(
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i16>, ptr @src16, align 64
; CHECK-NEXT:    [[TMP2:%.*]] = uitofp <2 x i16> [[TMP1]] to <2 x double>
; CHECK-NEXT:    store <2 x double> [[TMP2]], ptr @dst64, align 64
; CHECK-NEXT:    ret void
;
  ; Scalar input: consecutive loads, per-element conversions, consecutive stores.
  %ld0 = load i16, ptr @src16, align 64
  %ld1 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 1), align 2
  %cvt0 = uitofp i16 %ld0 to double
  %cvt1 = uitofp i16 %ld1 to double
  store double %cvt0, ptr @dst64, align 64
  store double %cvt1, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 1), align 8
  ret void
}

; Four adjacent i16 elements of @src16 are converted with uitofp to double and
; stored to elements 0..3 of @dst64.
; Expected output:
;   - SSE prefix: two <2 x i16> to <2 x double> chains (source elements 0, 2).
;   - AVX prefix (all runs that pass -mcpu): a single <4 x i16> to
;     <4 x double> chain.
define void @uitofp_4i16_4f64() #0 {
; SSE-LABEL: @uitofp_4i16_4f64(
; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i16>, ptr @src16, align 64
; SSE-NEXT:    [[TMP2:%.*]] = uitofp <2 x i16> [[TMP1]] to <2 x double>
; SSE-NEXT:    store <2 x double> [[TMP2]], ptr @dst64, align 64
; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 2), align 4
; SSE-NEXT:    [[TMP4:%.*]] = uitofp <2 x i16> [[TMP3]] to <2 x double>
; SSE-NEXT:    store <2 x double> [[TMP4]], ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 2), align 16
; SSE-NEXT:    ret void
;
; AVX-LABEL: @uitofp_4i16_4f64(
; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr @src16, align 64
; AVX-NEXT:    [[TMP2:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x double>
; AVX-NEXT:    store <4 x double> [[TMP2]], ptr @dst64, align 64
; AVX-NEXT:    ret void
;
  ; Scalar input: consecutive loads, per-element conversions, consecutive stores.
  %ld0 = load i16, ptr @src16, align 64
  %ld1 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 1), align 2
  %ld2 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 2), align 4
  %ld3 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 3), align 2
  %cvt0 = uitofp i16 %ld0 to double
  %cvt1 = uitofp i16 %ld1 to double
  %cvt2 = uitofp i16 %ld2 to double
  %cvt3 = uitofp i16 %ld3 to double
  store double %cvt0, ptr @dst64, align 64
  store double %cvt1, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 1), align 8
  store double %cvt2, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 2), align 16
  store double %cvt3, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 3), align 8
  ret void
}

; The first eight i16 elements of @src16 are converted with uitofp to double
; and stored to all eight elements of @dst64.
; Expected output:
;   - SSE prefix: four <2 x i16> to <2 x double> chains (elements 0, 2, 4, 6).
;   - AVX256 prefix: two <4 x i16> to <4 x double> chains (elements 0, 4).
;   - AVX512 prefix: one <8 x i16> to <8 x double> chain.
define void @uitofp_8i16_8f64() #0 {
; SSE-LABEL: @uitofp_8i16_8f64(
; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i16>, ptr @src16, align 64
; SSE-NEXT:    [[TMP2:%.*]] = uitofp <2 x i16> [[TMP1]] to <2 x double>
; SSE-NEXT:    store <2 x double> [[TMP2]], ptr @dst64, align 64
; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 2), align 4
; SSE-NEXT:    [[TMP4:%.*]] = uitofp <2 x i16> [[TMP3]] to <2 x double>
; SSE-NEXT:    store <2 x double> [[TMP4]], ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 2), align 16
; SSE-NEXT:    [[TMP5:%.*]] = load <2 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 4), align 8
; SSE-NEXT:    [[TMP6:%.*]] = uitofp <2 x i16> [[TMP5]] to <2 x double>
; SSE-NEXT:    store <2 x double> [[TMP6]], ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 4), align 32
; SSE-NEXT:    [[TMP7:%.*]] = load <2 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 6), align 4
; SSE-NEXT:    [[TMP8:%.*]] = uitofp <2 x i16> [[TMP7]] to <2 x double>
; SSE-NEXT:    store <2 x double> [[TMP8]], ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 6), align 16
; SSE-NEXT:    ret void
;
; AVX256-LABEL: @uitofp_8i16_8f64(
; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr @src16, align 64
; AVX256-NEXT:    [[TMP2:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x double>
; AVX256-NEXT:    store <4 x double> [[TMP2]], ptr @dst64, align 64
; AVX256-NEXT:    [[TMP3:%.*]] = load <4 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 4), align 8
; AVX256-NEXT:    [[TMP4:%.*]] = uitofp <4 x i16> [[TMP3]] to <4 x double>
; AVX256-NEXT:    store <4 x double> [[TMP4]], ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 4), align 32
; AVX256-NEXT:    ret void
;
; AVX512-LABEL: @uitofp_8i16_8f64(
; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr @src16, align 64
; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <8 x i16> [[TMP1]] to <8 x double>
; AVX512-NEXT:    store <8 x double> [[TMP2]], ptr @dst64, align 64
; AVX512-NEXT:    ret void
;
  ; Scalar input: consecutive loads, per-element conversions, consecutive stores.
  %ld0 = load i16, ptr @src16, align 64
  %ld1 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 1), align 2
  %ld2 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 2), align 4
  %ld3 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 3), align 2
  %ld4 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 4), align 8
  %ld5 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 5), align 2
  %ld6 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 6), align 4
  %ld7 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 7), align 2
  %cvt0 = uitofp i16 %ld0 to double
  %cvt1 = uitofp i16 %ld1 to double
  %cvt2 = uitofp i16 %ld2 to double
  %cvt3 = uitofp i16 %ld3 to double
  %cvt4 = uitofp i16 %ld4 to double
  %cvt5 = uitofp i16 %ld5 to double
  %cvt6 = uitofp i16 %ld6 to double
  %cvt7 = uitofp i16 %ld7 to double
  store double %cvt0, ptr @dst64, align 64
  store double %cvt1, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 1), align 8
  store double %cvt2, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 2), align 16
  store double %cvt3, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 3), align 8
  store double %cvt4, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 4), align 32
  store double %cvt5, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 5), align 8
  store double %cvt6, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 6), align 16
  store double %cvt7, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 7), align 8
  ret void
}

; Two adjacent i8 elements of @src8 are converted with uitofp to double and
; stored to the first two elements of @dst64.
; Every configuration (shared CHECK prefix) expects a single <2 x i8> to
; <2 x double> chain.
define void @uitofp_2i8_2f64() #0 {
; CHECK-LABEL: @uitofp_2i8_2f64(
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i8>, ptr @src8, align 64
; CHECK-NEXT:    [[TMP2:%.*]] = uitofp <2 x i8> [[TMP1]] to <2 x double>
; CHECK-NEXT:    store <2 x double> [[TMP2]], ptr @dst64, align 64
; CHECK-NEXT:    ret void
;
  ; Scalar input: consecutive loads, per-element conversions, consecutive stores.
  %ld0 = load i8, ptr @src8, align 64
  %ld1 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 1), align 1
  %cvt0 = uitofp i8 %ld0 to double
  %cvt1 = uitofp i8 %ld1 to double
  store double %cvt0, ptr @dst64, align 64
  store double %cvt1, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 1), align 8
  ret void
}

; Four adjacent i8 elements of @src8 are converted with uitofp to double and
; stored to elements 0..3 of @dst64.
; Expected output:
;   - SSE prefix: two <2 x i8> to <2 x double> chains (source elements 0, 2).
;   - AVX prefix (all runs that pass -mcpu): a single <4 x i8> to
;     <4 x double> chain.
define void @uitofp_4i8_4f64() #0 {
; SSE-LABEL: @uitofp_4i8_4f64(
; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i8>, ptr @src8, align 64
; SSE-NEXT:    [[TMP2:%.*]] = uitofp <2 x i8> [[TMP1]] to <2 x double>
; SSE-NEXT:    store <2 x double> [[TMP2]], ptr @dst64, align 64
; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 2), align 2
; SSE-NEXT:    [[TMP4:%.*]] = uitofp <2 x i8> [[TMP3]] to <2 x double>
; SSE-NEXT:    store <2 x double> [[TMP4]], ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 2), align 16
; SSE-NEXT:    ret void
;
; AVX-LABEL: @uitofp_4i8_4f64(
; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr @src8, align 64
; AVX-NEXT:    [[TMP2:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x double>
; AVX-NEXT:    store <4 x double> [[TMP2]], ptr @dst64, align 64
; AVX-NEXT:    ret void
;
  ; Scalar input: consecutive loads, per-element conversions, consecutive stores.
  %ld0 = load i8, ptr @src8, align 64
  %ld1 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 1), align 1
  %ld2 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 2), align 2
  %ld3 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 3), align 1
  %cvt0 = uitofp i8 %ld0 to double
  %cvt1 = uitofp i8 %ld1 to double
  %cvt2 = uitofp i8 %ld2 to double
  %cvt3 = uitofp i8 %ld3 to double
  store double %cvt0, ptr @dst64, align 64
  store double %cvt1, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 1), align 8
  store double %cvt2, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 2), align 16
  store double %cvt3, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 3), align 8
  ret void
}

; The first eight i8 elements of @src8 are converted with uitofp to double and
; stored to all eight elements of @dst64.
; Expected output:
;   - SSE prefix: four <2 x i8> to <2 x double> chains (elements 0, 2, 4, 6).
;   - AVX256 prefix: two <4 x i8> to <4 x double> chains (elements 0, 4).
;   - AVX512 prefix: one <8 x i8> to <8 x double> chain.
define void @uitofp_8i8_8f64() #0 {
; SSE-LABEL: @uitofp_8i8_8f64(
; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i8>, ptr @src8, align 64
; SSE-NEXT:    [[TMP2:%.*]] = uitofp <2 x i8> [[TMP1]] to <2 x double>
; SSE-NEXT:    store <2 x double> [[TMP2]], ptr @dst64, align 64
; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 2), align 2
; SSE-NEXT:    [[TMP4:%.*]] = uitofp <2 x i8> [[TMP3]] to <2 x double>
; SSE-NEXT:    store <2 x double> [[TMP4]], ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 2), align 16
; SSE-NEXT:    [[TMP5:%.*]] = load <2 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 4), align 4
; SSE-NEXT:    [[TMP6:%.*]] = uitofp <2 x i8> [[TMP5]] to <2 x double>
; SSE-NEXT:    store <2 x double> [[TMP6]], ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 4), align 32
; SSE-NEXT:    [[TMP7:%.*]] = load <2 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 6), align 2
; SSE-NEXT:    [[TMP8:%.*]] = uitofp <2 x i8> [[TMP7]] to <2 x double>
; SSE-NEXT:    store <2 x double> [[TMP8]], ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 6), align 16
; SSE-NEXT:    ret void
;
; AVX256-LABEL: @uitofp_8i8_8f64(
; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr @src8, align 64
; AVX256-NEXT:    [[TMP2:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x double>
; AVX256-NEXT:    store <4 x double> [[TMP2]], ptr @dst64, align 64
; AVX256-NEXT:    [[TMP3:%.*]] = load <4 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 4), align 4
; AVX256-NEXT:    [[TMP4:%.*]] = uitofp <4 x i8> [[TMP3]] to <4 x double>
; AVX256-NEXT:    store <4 x double> [[TMP4]], ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 4), align 32
; AVX256-NEXT:    ret void
;
; AVX512-LABEL: @uitofp_8i8_8f64(
; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @src8, align 64
; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <8 x i8> [[TMP1]] to <8 x double>
; AVX512-NEXT:    store <8 x double> [[TMP2]], ptr @dst64, align 64
; AVX512-NEXT:    ret void
;
  ; Scalar input: consecutive loads, per-element conversions, consecutive stores.
  %ld0 = load i8, ptr @src8, align 64
  %ld1 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 1), align 1
  %ld2 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 2), align 2
  %ld3 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 3), align 1
  %ld4 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 4), align 4
  %ld5 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 5), align 1
  %ld6 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 6), align 2
  %ld7 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 7), align 1
  %cvt0 = uitofp i8 %ld0 to double
  %cvt1 = uitofp i8 %ld1 to double
  %cvt2 = uitofp i8 %ld2 to double
  %cvt3 = uitofp i8 %ld3 to double
  %cvt4 = uitofp i8 %ld4 to double
  %cvt5 = uitofp i8 %ld5 to double
  %cvt6 = uitofp i8 %ld6 to double
  %cvt7 = uitofp i8 %ld7 to double
  store double %cvt0, ptr @dst64, align 64
  store double %cvt1, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 1), align 8
  store double %cvt2, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 2), align 16
  store double %cvt3, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 3), align 8
  store double %cvt4, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 4), align 32
  store double %cvt5, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 5), align 8
  store double %cvt6, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 6), align 16
  store double %cvt7, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 7), align 8
  ret void
}

;
; UITOFP to vXf32
;

; Two adjacent i64 elements of @src64 are converted with uitofp to float and
; stored to the first two elements of @dst32.
; Expected output:
;   - SSE, AVX1 and AVX2 prefixes: the IR is expected to stay scalar, i.e. the
;     checks mirror the input instructions unchanged.
;   - AVX512 and AVX256DQ prefixes (both skylake-avx512 runs): a single
;     <2 x i64> to <2 x float> chain.
define void @uitofp_2i64_2f32() #0 {
; SSE-LABEL: @uitofp_2i64_2f32(
; SSE-NEXT:    [[LD0:%.*]] = load i64, ptr @src64, align 64
; SSE-NEXT:    [[LD1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @src64, i32 0, i64 1), align 8
; SSE-NEXT:    [[CVT0:%.*]] = uitofp i64 [[LD0]] to float
; SSE-NEXT:    [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
; SSE-NEXT:    store float [[CVT0]], ptr @dst32, align 64
; SSE-NEXT:    store float [[CVT1]], ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 1), align 4
; SSE-NEXT:    ret void
;
; AVX1-LABEL: @uitofp_2i64_2f32(
; AVX1-NEXT:    [[LD0:%.*]] = load i64, ptr @src64, align 64
; AVX1-NEXT:    [[LD1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @src64, i32 0, i64 1), align 8
; AVX1-NEXT:    [[CVT0:%.*]] = uitofp i64 [[LD0]] to float
; AVX1-NEXT:    [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
; AVX1-NEXT:    store float [[CVT0]], ptr @dst32, align 64
; AVX1-NEXT:    store float [[CVT1]], ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 1), align 4
; AVX1-NEXT:    ret void
;
; AVX2-LABEL: @uitofp_2i64_2f32(
; AVX2-NEXT:    [[LD0:%.*]] = load i64, ptr @src64, align 64
; AVX2-NEXT:    [[LD1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @src64, i32 0, i64 1), align 8
; AVX2-NEXT:    [[CVT0:%.*]] = uitofp i64 [[LD0]] to float
; AVX2-NEXT:    [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
; AVX2-NEXT:    store float [[CVT0]], ptr @dst32, align 64
; AVX2-NEXT:    store float [[CVT1]], ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 1), align 4
; AVX2-NEXT:    ret void
;
; AVX512-LABEL: @uitofp_2i64_2f32(
; AVX512-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @src64, align 64
; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x float>
; AVX512-NEXT:    store <2 x float> [[TMP2]], ptr @dst32, align 64
; AVX512-NEXT:    ret void
;
; AVX256DQ-LABEL: @uitofp_2i64_2f32(
; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @src64, align 64
; AVX256DQ-NEXT:    [[TMP2:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x float>
; AVX256DQ-NEXT:    store <2 x float> [[TMP2]], ptr @dst32, align 64
; AVX256DQ-NEXT:    ret void
;
  ; Scalar input: consecutive loads, per-element conversions, consecutive stores.
  %ld0 = load i64, ptr @src64, align 64
  %ld1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @src64, i32 0, i64 1), align 8
  %cvt0 = uitofp i64 %ld0 to float
  %cvt1 = uitofp i64 %ld1 to float
  store float %cvt0, ptr @dst32, align 64
  store float %cvt1, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 1), align 4
  ret void
}

; Four adjacent i64 elements of @src64 are converted with uitofp to float and
; stored to elements 0..3 of @dst32.
; Every configuration (shared CHECK prefix) expects a single <4 x i64> to
; <4 x float> chain.
define void @uitofp_4i64_4f32() #0 {
; CHECK-LABEL: @uitofp_4i64_4f32(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @src64, align 64
; CHECK-NEXT:    [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x float>
; CHECK-NEXT:    store <4 x float> [[TMP2]], ptr @dst32, align 64
; CHECK-NEXT:    ret void
;
  ; Scalar input: consecutive loads, per-element conversions, consecutive stores.
  %ld0 = load i64, ptr @src64, align 64
  %ld1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @src64, i32 0, i64 1), align 8
  %ld2 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @src64, i32 0, i64 2), align 16
  %ld3 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @src64, i32 0, i64 3), align 8
  %cvt0 = uitofp i64 %ld0 to float
  %cvt1 = uitofp i64 %ld1 to float
  %cvt2 = uitofp i64 %ld2 to float
  %cvt3 = uitofp i64 %ld3 to float
  store float %cvt0, ptr @dst32, align 64
  store float %cvt1, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 1), align 4
  store float %cvt2, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 2), align 8
  store float %cvt3, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 3), align 4
  ret void
}

; All eight i64 elements of @src64 are converted with uitofp to float and
; stored to elements 0..7 of @dst32.
; Expected output:
;   - SSE prefix: two <4 x i64> to <4 x float> chains (elements 0, 4).
;   - AVX prefix (all runs that pass -mcpu): a single <8 x i64> to
;     <8 x float> chain.
define void @uitofp_8i64_8f32() #0 {
; SSE-LABEL: @uitofp_8i64_8f32(
; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @src64, align 64
; SSE-NEXT:    [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x float>
; SSE-NEXT:    store <4 x float> [[TMP2]], ptr @dst32, align 64
; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @src64, i32 0, i64 4), align 32
; SSE-NEXT:    [[TMP4:%.*]] = uitofp <4 x i64> [[TMP3]] to <4 x float>
; SSE-NEXT:    store <4 x float> [[TMP4]], ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 4), align 16
; SSE-NEXT:    ret void
;
; AVX-LABEL: @uitofp_8i64_8f32(
; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr @src64, align 64
; AVX-NEXT:    [[TMP2:%.*]] = uitofp <8 x i64> [[TMP1]] to <8 x float>
; AVX-NEXT:    store <8 x float> [[TMP2]], ptr @dst32, align 64
; AVX-NEXT:    ret void
;
  ; Scalar input: consecutive loads, per-element conversions, consecutive stores.
  %ld0 = load i64, ptr @src64, align 64
  %ld1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @src64, i32 0, i64 1), align 8
  %ld2 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @src64, i32 0, i64 2), align 16
  %ld3 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @src64, i32 0, i64 3), align 8
  %ld4 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @src64, i32 0, i64 4), align 32
  %ld5 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @src64, i32 0, i64 5), align 8
  %ld6 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @src64, i32 0, i64 6), align 16
  %ld7 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @src64, i32 0, i64 7), align 8
  %cvt0 = uitofp i64 %ld0 to float
  %cvt1 = uitofp i64 %ld1 to float
  %cvt2 = uitofp i64 %ld2 to float
  %cvt3 = uitofp i64 %ld3 to float
  %cvt4 = uitofp i64 %ld4 to float
  %cvt5 = uitofp i64 %ld5 to float
  %cvt6 = uitofp i64 %ld6 to float
  %cvt7 = uitofp i64 %ld7 to float
  store float %cvt0, ptr @dst32, align 64
  store float %cvt1, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 1), align 4
  store float %cvt2, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 2), align 8
  store float %cvt3, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 3), align 4
  store float %cvt4, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 4), align 16
  store float %cvt5, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 5), align 4
  store float %cvt6, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 6), align 8
  store float %cvt7, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 7), align 4
  ret void
}

; Converts four consecutive i32 elements of @src32 to float with scalar
; uitofp and stores the results to the first four elements of @dst32.
; Every run line expects the same result: one <4 x i32> load, one vector
; uitofp to <4 x float>, and one <4 x float> store.
define void @uitofp_4i32_4f32() #0 {
; CHECK-LABEL: @uitofp_4i32_4f32(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 64
; CHECK-NEXT:    [[TMP2:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x float>
; CHECK-NEXT:    store <4 x float> [[TMP2]], ptr @dst32, align 64
; CHECK-NEXT:    ret void
;
  ; Scalar loads of @src32[0..3]; each alignment follows the element's byte
  ; offset from the 64-byte-aligned global.
  %ld0 = load i32, ptr @src32, align 64
  %ld1 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 1), align 4
  %ld2 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 2), align 8
  %ld3 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 3), align 4
  ; One unsigned i32 -> float conversion per loaded element.
  %cvt0 = uitofp i32 %ld0 to float
  %cvt1 = uitofp i32 %ld1 to float
  %cvt2 = uitofp i32 %ld2 to float
  %cvt3 = uitofp i32 %ld3 to float
  ; Scalar stores to @dst32[0..3], in the same element order as the loads.
  store float %cvt0, ptr @dst32, align 64
  store float %cvt1, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 1), align 4
  store float %cvt2, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 2), align 8
  store float %cvt3, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 3), align 4
  ret void
}

; Converts eight consecutive i32 elements of @src32 to float with scalar
; uitofp and stores the results to the first eight elements of @dst32.
; Expected shapes differ by run line:
;   - the run without an -mcpu flag expects two 4-wide load/convert/store
;     groups (elements 0..3 and 4..7);
;   - the runs with an -mcpu flag expect a single 8-wide group.
define void @uitofp_8i32_8f32() #0 {
; SSE-LABEL: @uitofp_8i32_8f32(
; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 64
; SSE-NEXT:    [[TMP2:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x float>
; SSE-NEXT:    store <4 x float> [[TMP2]], ptr @dst32, align 64
; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 4), align 16
; SSE-NEXT:    [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float>
; SSE-NEXT:    store <4 x float> [[TMP4]], ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 4), align 16
; SSE-NEXT:    ret void
;
; AVX-LABEL: @uitofp_8i32_8f32(
; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @src32, align 64
; AVX-NEXT:    [[TMP2:%.*]] = uitofp <8 x i32> [[TMP1]] to <8 x float>
; AVX-NEXT:    store <8 x float> [[TMP2]], ptr @dst32, align 64
; AVX-NEXT:    ret void
;
  ; Scalar loads of @src32[0..7]; each alignment follows the element's byte
  ; offset from the 64-byte-aligned global (element 4 sits at byte 16).
  %ld0 = load i32, ptr @src32, align 64
  %ld1 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 1), align 4
  %ld2 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 2), align 8
  %ld3 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 3), align 4
  %ld4 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 4), align 16
  %ld5 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 5), align 4
  %ld6 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 6), align 8
  %ld7 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 7), align 4
  ; One unsigned i32 -> float conversion per loaded element.
  %cvt0 = uitofp i32 %ld0 to float
  %cvt1 = uitofp i32 %ld1 to float
  %cvt2 = uitofp i32 %ld2 to float
  %cvt3 = uitofp i32 %ld3 to float
  %cvt4 = uitofp i32 %ld4 to float
  %cvt5 = uitofp i32 %ld5 to float
  %cvt6 = uitofp i32 %ld6 to float
  %cvt7 = uitofp i32 %ld7 to float
  ; Scalar stores to @dst32[0..7], in the same element order as the loads.
  store float %cvt0, ptr @dst32, align 64
  store float %cvt1, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 1), align 4
  store float %cvt2, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 2), align 8
  store float %cvt3, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 3), align 4
  store float %cvt4, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 4), align 16
  store float %cvt5, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 5), align 4
  store float %cvt6, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 6), align 8
  store float %cvt7, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 7), align 4
  ret void
}

; Converts all sixteen i32 elements of @src32 to float with scalar uitofp and
; stores the results to all sixteen elements of @dst32.
; Expected shapes differ by run line:
;   - the run without an -mcpu flag expects four 4-wide groups;
;   - corei7-avx, bdver1, core-avx2, and skylake-avx512 with prefer-256-bit
;     enabled expect two 8-wide groups;
;   - skylake-avx512 with prefer-256-bit disabled expects one 16-wide group.
define void @uitofp_16i32_16f32() #0 {
; SSE-LABEL: @uitofp_16i32_16f32(
; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 64
; SSE-NEXT:    [[TMP2:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x float>
; SSE-NEXT:    store <4 x float> [[TMP2]], ptr @dst32, align 64
; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 4), align 16
; SSE-NEXT:    [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float>
; SSE-NEXT:    store <4 x float> [[TMP4]], ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 4), align 16
; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 8), align 32
; SSE-NEXT:    [[TMP6:%.*]] = uitofp <4 x i32> [[TMP5]] to <4 x float>
; SSE-NEXT:    store <4 x float> [[TMP6]], ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 8), align 32
; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 12), align 16
; SSE-NEXT:    [[TMP8:%.*]] = uitofp <4 x i32> [[TMP7]] to <4 x float>
; SSE-NEXT:    store <4 x float> [[TMP8]], ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 12), align 16
; SSE-NEXT:    ret void
;
; AVX256-LABEL: @uitofp_16i32_16f32(
; AVX256-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @src32, align 64
; AVX256-NEXT:    [[TMP2:%.*]] = uitofp <8 x i32> [[TMP1]] to <8 x float>
; AVX256-NEXT:    store <8 x float> [[TMP2]], ptr @dst32, align 64
; AVX256-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 8), align 32
; AVX256-NEXT:    [[TMP4:%.*]] = uitofp <8 x i32> [[TMP3]] to <8 x float>
; AVX256-NEXT:    store <8 x float> [[TMP4]], ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 8), align 32
; AVX256-NEXT:    ret void
;
; AVX512-LABEL: @uitofp_16i32_16f32(
; AVX512-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr @src32, align 64
; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <16 x i32> [[TMP1]] to <16 x float>
; AVX512-NEXT:    store <16 x float> [[TMP2]], ptr @dst32, align 64
; AVX512-NEXT:    ret void
;
  ; Scalar loads of @src32[0..15]. Element 0 is addressed through an explicit
  ; zero-index getelementptr here rather than the bare global. Each alignment
  ; follows the element's byte offset from the 64-byte-aligned global.
  %ld0  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 0 ), align 64
  %ld1  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 1 ), align 4
  %ld2  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 2 ), align 8
  %ld3  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 3 ), align 4
  %ld4  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 4 ), align 16
  %ld5  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 5 ), align 4
  %ld6  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 6 ), align 8
  %ld7  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 7 ), align 4
  %ld8  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 8 ), align 32
  %ld9  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 9 ), align 4
  %ld10 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 10), align 8
  %ld11 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 11), align 4
  %ld12 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 12), align 16
  %ld13 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 13), align 4
  %ld14 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 14), align 8
  %ld15 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @src32, i32 0, i64 15), align 4
  ; One unsigned i32 -> float conversion per loaded element.
  %cvt0  = uitofp i32 %ld0  to float
  %cvt1  = uitofp i32 %ld1  to float
  %cvt2  = uitofp i32 %ld2  to float
  %cvt3  = uitofp i32 %ld3  to float
  %cvt4  = uitofp i32 %ld4  to float
  %cvt5  = uitofp i32 %ld5  to float
  %cvt6  = uitofp i32 %ld6  to float
  %cvt7  = uitofp i32 %ld7  to float
  %cvt8  = uitofp i32 %ld8  to float
  %cvt9  = uitofp i32 %ld9  to float
  %cvt10 = uitofp i32 %ld10 to float
  %cvt11 = uitofp i32 %ld11 to float
  %cvt12 = uitofp i32 %ld12 to float
  %cvt13 = uitofp i32 %ld13 to float
  %cvt14 = uitofp i32 %ld14 to float
  %cvt15 = uitofp i32 %ld15 to float
  ; Scalar stores to @dst32[0..15], in the same element order as the loads.
  store float %cvt0 , ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 0 ), align 64
  store float %cvt1 , ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 1 ), align 4
  store float %cvt2 , ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 2 ), align 8
  store float %cvt3 , ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 3 ), align 4
  store float %cvt4 , ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 4 ), align 16
  store float %cvt5 , ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 5 ), align 4
  store float %cvt6 , ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 6 ), align 8
  store float %cvt7 , ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 7 ), align 4
  store float %cvt8 , ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 8 ), align 32
  store float %cvt9 , ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 9 ), align 4
  store float %cvt10, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 10), align 8
  store float %cvt11, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 11), align 4
  store float %cvt12, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 12), align 16
  store float %cvt13, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 13), align 4
  store float %cvt14, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 14), align 8
  store float %cvt15, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 15), align 4
  ret void
}

; Converts four consecutive i16 elements of @src16 to float with scalar
; uitofp and stores the results to the first four elements of @dst32.
; Every run line expects the same result: one <4 x i16> load, one vector
; uitofp to <4 x float>, and one <4 x float> store.
define void @uitofp_4i16_4f32() #0 {
; CHECK-LABEL: @uitofp_4i16_4f32(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr @src16, align 64
; CHECK-NEXT:    [[TMP2:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x float>
; CHECK-NEXT:    store <4 x float> [[TMP2]], ptr @dst32, align 64
; CHECK-NEXT:    ret void
;
  ; Scalar loads of @src16[0..3]; each alignment follows the element's byte
  ; offset from the 64-byte-aligned global.
  %ld0 = load i16, ptr @src16, align 64
  %ld1 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 1), align 2
  %ld2 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 2), align 4
  %ld3 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 3), align 2
  ; One unsigned i16 -> float conversion per loaded element.
  %cvt0 = uitofp i16 %ld0 to float
  %cvt1 = uitofp i16 %ld1 to float
  %cvt2 = uitofp i16 %ld2 to float
  %cvt3 = uitofp i16 %ld3 to float
  ; Scalar stores to @dst32[0..3], in the same element order as the loads.
  store float %cvt0, ptr @dst32, align 64
  store float %cvt1, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 1), align 4
  store float %cvt2, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 2), align 8
  store float %cvt3, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 3), align 4
  ret void
}

; Converts eight consecutive i16 elements of @src16 to float with scalar
; uitofp and stores the results to the first eight elements of @dst32.
; Expected shapes differ by run line:
;   - the run without an -mcpu flag expects two 4-wide load/convert/store
;     groups (elements 0..3 and 4..7);
;   - the runs with an -mcpu flag expect a single 8-wide group.
define void @uitofp_8i16_8f32() #0 {
; SSE-LABEL: @uitofp_8i16_8f32(
; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr @src16, align 64
; SSE-NEXT:    [[TMP2:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x float>
; SSE-NEXT:    store <4 x float> [[TMP2]], ptr @dst32, align 64
; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 4), align 8
; SSE-NEXT:    [[TMP4:%.*]] = uitofp <4 x i16> [[TMP3]] to <4 x float>
; SSE-NEXT:    store <4 x float> [[TMP4]], ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 4), align 16
; SSE-NEXT:    ret void
;
; AVX-LABEL: @uitofp_8i16_8f32(
; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr @src16, align 64
; AVX-NEXT:    [[TMP2:%.*]] = uitofp <8 x i16> [[TMP1]] to <8 x float>
; AVX-NEXT:    store <8 x float> [[TMP2]], ptr @dst32, align 64
; AVX-NEXT:    ret void
;
  ; Scalar loads of @src16[0..7]; each alignment follows the element's byte
  ; offset from the 64-byte-aligned global (element 4 sits at byte 8).
  %ld0 = load i16, ptr @src16, align 64
  %ld1 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 1), align 2
  %ld2 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 2), align 4
  %ld3 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 3), align 2
  %ld4 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 4), align 8
  %ld5 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 5), align 2
  %ld6 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 6), align 4
  %ld7 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 7), align 2
  ; One unsigned i16 -> float conversion per loaded element.
  %cvt0 = uitofp i16 %ld0 to float
  %cvt1 = uitofp i16 %ld1 to float
  %cvt2 = uitofp i16 %ld2 to float
  %cvt3 = uitofp i16 %ld3 to float
  %cvt4 = uitofp i16 %ld4 to float
  %cvt5 = uitofp i16 %ld5 to float
  %cvt6 = uitofp i16 %ld6 to float
  %cvt7 = uitofp i16 %ld7 to float
  ; Scalar stores to @dst32[0..7], in the same element order as the loads.
  store float %cvt0, ptr @dst32, align 64
  store float %cvt1, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 1), align 4
  store float %cvt2, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 2), align 8
  store float %cvt3, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 3), align 4
  store float %cvt4, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 4), align 16
  store float %cvt5, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 5), align 4
  store float %cvt6, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 6), align 8
  store float %cvt7, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 7), align 4
  ret void
}

; Converts sixteen consecutive i16 elements of @src16 to float with scalar
; uitofp and stores the results to all sixteen elements of @dst32.
; Expected shapes differ by run line:
;   - the run without an -mcpu flag expects four 4-wide groups;
;   - corei7-avx, bdver1, core-avx2, and skylake-avx512 with prefer-256-bit
;     enabled expect two 8-wide groups;
;   - skylake-avx512 with prefer-256-bit disabled expects one 16-wide group.
define void @uitofp_16i16_16f32() #0 {
; SSE-LABEL: @uitofp_16i16_16f32(
; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr @src16, align 64
; SSE-NEXT:    [[TMP2:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x float>
; SSE-NEXT:    store <4 x float> [[TMP2]], ptr @dst32, align 64
; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 4), align 8
; SSE-NEXT:    [[TMP4:%.*]] = uitofp <4 x i16> [[TMP3]] to <4 x float>
; SSE-NEXT:    store <4 x float> [[TMP4]], ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 4), align 16
; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 8), align 16
; SSE-NEXT:    [[TMP6:%.*]] = uitofp <4 x i16> [[TMP5]] to <4 x float>
; SSE-NEXT:    store <4 x float> [[TMP6]], ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 8), align 32
; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 12), align 8
; SSE-NEXT:    [[TMP8:%.*]] = uitofp <4 x i16> [[TMP7]] to <4 x float>
; SSE-NEXT:    store <4 x float> [[TMP8]], ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 12), align 16
; SSE-NEXT:    ret void
;
; AVX256-LABEL: @uitofp_16i16_16f32(
; AVX256-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr @src16, align 64
; AVX256-NEXT:    [[TMP2:%.*]] = uitofp <8 x i16> [[TMP1]] to <8 x float>
; AVX256-NEXT:    store <8 x float> [[TMP2]], ptr @dst32, align 64
; AVX256-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 8), align 16
; AVX256-NEXT:    [[TMP4:%.*]] = uitofp <8 x i16> [[TMP3]] to <8 x float>
; AVX256-NEXT:    store <8 x float> [[TMP4]], ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 8), align 32
; AVX256-NEXT:    ret void
;
; AVX512-LABEL: @uitofp_16i16_16f32(
; AVX512-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr @src16, align 64
; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <16 x i16> [[TMP1]] to <16 x float>
; AVX512-NEXT:    store <16 x float> [[TMP2]], ptr @dst32, align 64
; AVX512-NEXT:    ret void
;
  ; Scalar loads of @src16[0..15]. Element 0 is addressed through an explicit
  ; zero-index getelementptr here rather than the bare global. Each alignment
  ; follows the element's byte offset from the 64-byte-aligned global.
  %ld0  = load i16, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 0 ), align 64
  %ld1  = load i16, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 1 ), align 2
  %ld2  = load i16, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 2 ), align 4
  %ld3  = load i16, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 3 ), align 2
  %ld4  = load i16, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 4 ), align 8
  %ld5  = load i16, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 5 ), align 2
  %ld6  = load i16, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 6 ), align 4
  %ld7  = load i16, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 7 ), align 2
  %ld8  = load i16, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 8 ), align 16
  %ld9  = load i16, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 9 ), align 2
  %ld10 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 10), align 4
  %ld11 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 11), align 2
  %ld12 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 12), align 8
  %ld13 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 13), align 2
  %ld14 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 14), align 4
  %ld15 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @src16, i32 0, i64 15), align 2
  ; One unsigned i16 -> float conversion per loaded element.
  %cvt0  = uitofp i16 %ld0  to float
  %cvt1  = uitofp i16 %ld1  to float
  %cvt2  = uitofp i16 %ld2  to float
  %cvt3  = uitofp i16 %ld3  to float
  %cvt4  = uitofp i16 %ld4  to float
  %cvt5  = uitofp i16 %ld5  to float
  %cvt6  = uitofp i16 %ld6  to float
  %cvt7  = uitofp i16 %ld7  to float
  %cvt8  = uitofp i16 %ld8  to float
  %cvt9  = uitofp i16 %ld9  to float
  %cvt10 = uitofp i16 %ld10 to float
  %cvt11 = uitofp i16 %ld11 to float
  %cvt12 = uitofp i16 %ld12 to float
  %cvt13 = uitofp i16 %ld13 to float
  %cvt14 = uitofp i16 %ld14 to float
  %cvt15 = uitofp i16 %ld15 to float
  ; Scalar stores to @dst32[0..15], in the same element order as the loads.
  store float %cvt0 , ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 0 ), align 64
  store float %cvt1 , ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 1 ), align 4
  store float %cvt2 , ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 2 ), align 8
  store float %cvt3 , ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 3 ), align 4
  store float %cvt4 , ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 4 ), align 16
  store float %cvt5 , ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 5 ), align 4
  store float %cvt6 , ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 6 ), align 8
  store float %cvt7 , ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 7 ), align 4
  store float %cvt8 , ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 8 ), align 32
  store float %cvt9 , ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 9 ), align 4
  store float %cvt10, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 10), align 8
  store float %cvt11, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 11), align 4
  store float %cvt12, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 12), align 16
  store float %cvt13, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 13), align 4
  store float %cvt14, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 14), align 8
  store float %cvt15, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 15), align 4
  ret void
}

; Converts four consecutive i8 elements of @src8 to float with scalar uitofp
; and stores the results to the first four elements of @dst32.
; Every run line expects the same result: one <4 x i8> load, one vector
; uitofp to <4 x float>, and one <4 x float> store.
define void @uitofp_4i8_4f32() #0 {
; CHECK-LABEL: @uitofp_4i8_4f32(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr @src8, align 64
; CHECK-NEXT:    [[TMP2:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x float>
; CHECK-NEXT:    store <4 x float> [[TMP2]], ptr @dst32, align 64
; CHECK-NEXT:    ret void
;
  ; Scalar loads of @src8[0..3]; each alignment follows the element's byte
  ; offset from the 64-byte-aligned global.
  %ld0 = load i8, ptr @src8, align 64
  %ld1 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 1), align 1
  %ld2 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 2), align 2
  %ld3 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 3), align 1
  ; One unsigned i8 -> float conversion per loaded element.
  %cvt0 = uitofp i8 %ld0 to float
  %cvt1 = uitofp i8 %ld1 to float
  %cvt2 = uitofp i8 %ld2 to float
  %cvt3 = uitofp i8 %ld3 to float
  ; Scalar stores to @dst32[0..3], in the same element order as the loads.
  store float %cvt0, ptr @dst32, align 64
  store float %cvt1, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 1), align 4
  store float %cvt2, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 2), align 8
  store float %cvt3, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 3), align 4
  ret void
}

; Converts eight consecutive i8 elements of @src8 to float with scalar uitofp
; and stores the results to the first eight elements of @dst32.
; Expected shapes differ by run line:
;   - the run without an -mcpu flag expects two 4-wide load/convert/store
;     groups (elements 0..3 and 4..7);
;   - the runs with an -mcpu flag expect a single 8-wide group.
define void @uitofp_8i8_8f32() #0 {
; SSE-LABEL: @uitofp_8i8_8f32(
; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr @src8, align 64
; SSE-NEXT:    [[TMP2:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x float>
; SSE-NEXT:    store <4 x float> [[TMP2]], ptr @dst32, align 64
; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 4), align 4
; SSE-NEXT:    [[TMP4:%.*]] = uitofp <4 x i8> [[TMP3]] to <4 x float>
; SSE-NEXT:    store <4 x float> [[TMP4]], ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 4), align 16
; SSE-NEXT:    ret void
;
; AVX-LABEL: @uitofp_8i8_8f32(
; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @src8, align 64
; AVX-NEXT:    [[TMP2:%.*]] = uitofp <8 x i8> [[TMP1]] to <8 x float>
; AVX-NEXT:    store <8 x float> [[TMP2]], ptr @dst32, align 64
; AVX-NEXT:    ret void
;
  ; Scalar loads of @src8[0..7]; each alignment follows the element's byte
  ; offset from the 64-byte-aligned global (element 4 sits at byte 4).
  %ld0 = load i8, ptr @src8, align 64
  %ld1 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 1), align 1
  %ld2 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 2), align 2
  %ld3 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 3), align 1
  %ld4 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 4), align 4
  %ld5 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 5), align 1
  %ld6 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 6), align 2
  %ld7 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 7), align 1
  ; One unsigned i8 -> float conversion per loaded element.
  %cvt0 = uitofp i8 %ld0 to float
  %cvt1 = uitofp i8 %ld1 to float
  %cvt2 = uitofp i8 %ld2 to float
  %cvt3 = uitofp i8 %ld3 to float
  %cvt4 = uitofp i8 %ld4 to float
  %cvt5 = uitofp i8 %ld5 to float
  %cvt6 = uitofp i8 %ld6 to float
  %cvt7 = uitofp i8 %ld7 to float
  ; Scalar stores to @dst32[0..7], in the same element order as the loads.
  store float %cvt0, ptr @dst32, align 64
  store float %cvt1, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 1), align 4
  store float %cvt2, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 2), align 8
  store float %cvt3, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 3), align 4
  store float %cvt4, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 4), align 16
  store float %cvt5, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 5), align 4
  store float %cvt6, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 6), align 8
  store float %cvt7, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 7), align 4
  ret void
}

; Converts sixteen consecutive i8 elements of @src8 to float with scalar
; uitofp and stores the results to all sixteen elements of @dst32.
; Expected shapes differ by run line:
;   - the run without an -mcpu flag expects four 4-wide groups;
;   - corei7-avx, bdver1, core-avx2, and skylake-avx512 with prefer-256-bit
;     enabled expect two 8-wide groups;
;   - skylake-avx512 with prefer-256-bit disabled expects one 16-wide group.
define void @uitofp_16i8_16f32() #0 {
; SSE-LABEL: @uitofp_16i8_16f32(
; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr @src8, align 64
; SSE-NEXT:    [[TMP2:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x float>
; SSE-NEXT:    store <4 x float> [[TMP2]], ptr @dst32, align 64
; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 4), align 4
; SSE-NEXT:    [[TMP4:%.*]] = uitofp <4 x i8> [[TMP3]] to <4 x float>
; SSE-NEXT:    store <4 x float> [[TMP4]], ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 4), align 16
; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 8), align 8
; SSE-NEXT:    [[TMP6:%.*]] = uitofp <4 x i8> [[TMP5]] to <4 x float>
; SSE-NEXT:    store <4 x float> [[TMP6]], ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 8), align 32
; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 12), align 4
; SSE-NEXT:    [[TMP8:%.*]] = uitofp <4 x i8> [[TMP7]] to <4 x float>
; SSE-NEXT:    store <4 x float> [[TMP8]], ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 12), align 16
; SSE-NEXT:    ret void
;
; AVX256-LABEL: @uitofp_16i8_16f32(
; AVX256-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @src8, align 64
; AVX256-NEXT:    [[TMP2:%.*]] = uitofp <8 x i8> [[TMP1]] to <8 x float>
; AVX256-NEXT:    store <8 x float> [[TMP2]], ptr @dst32, align 64
; AVX256-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 8), align 8
; AVX256-NEXT:    [[TMP4:%.*]] = uitofp <8 x i8> [[TMP3]] to <8 x float>
; AVX256-NEXT:    store <8 x float> [[TMP4]], ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 8), align 32
; AVX256-NEXT:    ret void
;
; AVX512-LABEL: @uitofp_16i8_16f32(
; AVX512-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @src8, align 64
; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <16 x i8> [[TMP1]] to <16 x float>
; AVX512-NEXT:    store <16 x float> [[TMP2]], ptr @dst32, align 64
; AVX512-NEXT:    ret void
;
  ; Scalar loads of @src8[0..15]. Element 0 is addressed through an explicit
  ; zero-index getelementptr here rather than the bare global. Each alignment
  ; follows the element's byte offset from the 64-byte-aligned global.
  %ld0  = load i8, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 0 ), align 64
  %ld1  = load i8, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 1 ), align 1
  %ld2  = load i8, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 2 ), align 2
  %ld3  = load i8, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 3 ), align 1
  %ld4  = load i8, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 4 ), align 4
  %ld5  = load i8, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 5 ), align 1
  %ld6  = load i8, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 6 ), align 2
  %ld7  = load i8, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 7 ), align 1
  %ld8  = load i8, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 8 ), align 8
  %ld9  = load i8, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 9 ), align 1
  %ld10 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 10), align 2
  %ld11 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 11), align 1
  %ld12 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 12), align 4
  %ld13 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 13), align 1
  %ld14 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 14), align 2
  %ld15 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @src8, i32 0, i64 15), align 1
  ; One unsigned i8 -> float conversion per loaded element.
  %cvt0  = uitofp i8 %ld0  to float
  %cvt1  = uitofp i8 %ld1  to float
  %cvt2  = uitofp i8 %ld2  to float
  %cvt3  = uitofp i8 %ld3  to float
  %cvt4  = uitofp i8 %ld4  to float
  %cvt5  = uitofp i8 %ld5  to float
  %cvt6  = uitofp i8 %ld6  to float
  %cvt7  = uitofp i8 %ld7  to float
  %cvt8  = uitofp i8 %ld8  to float
  %cvt9  = uitofp i8 %ld9  to float
  %cvt10 = uitofp i8 %ld10 to float
  %cvt11 = uitofp i8 %ld11 to float
  %cvt12 = uitofp i8 %ld12 to float
  %cvt13 = uitofp i8 %ld13 to float
  %cvt14 = uitofp i8 %ld14 to float
  %cvt15 = uitofp i8 %ld15 to float
  ; Scalar stores to @dst32[0..15], in the same element order as the loads.
  store float %cvt0 , ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 0 ), align 64
  store float %cvt1 , ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 1 ), align 4
  store float %cvt2 , ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 2 ), align 8
  store float %cvt3 , ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 3 ), align 4
  store float %cvt4 , ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 4 ), align 16
  store float %cvt5 , ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 5 ), align 4
  store float %cvt6 , ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 6 ), align 8
  store float %cvt7 , ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 7 ), align 4
  store float %cvt8 , ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 8 ), align 32
  store float %cvt9 , ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 9 ), align 4
  store float %cvt10, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 10), align 8
  store float %cvt11, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 11), align 4
  store float %cvt12, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 12), align 16
  store float %cvt13, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 13), align 4
  store float %cvt14, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 14), align 8
  store float %cvt15, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 15), align 4
  ret void
}

; Attribute group that the test functions reference as #0; it contains only
; nounwind.
attributes #0 = { nounwind }
